In [32]:
# Regression. Partial F-tests and Lack-of-Fit Tests
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import os
# Load data
# The data folder is machine-specific; allow overriding it through the
# AUTO_DATA_DIR environment variable instead of hardcoding one user's
# absolute path (original path kept as the default for compatibility).
DATA_DIR = os.environ.get(
    "AUTO_DATA_DIR",
    "C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data",
)
os.chdir(DATA_DIR)  # Change the working directory
# The Auto data marks missing horsepower values with '?' (presumably why a
# later cell coerces horsepower to numeric — confirm against the raw file);
# na_values handles this at read time. Backward compatible: to_numeric with
# errors='coerce' on an already-numeric column is a no-op.
auto = pd.read_csv("Auto.csv", na_values="?")  # Read the data file in the CSV format
In [56]:
# 1. PARTIAL F-TEST
# Full model: mpg regressed on all four candidate predictors.
# Auto.csv stores horsepower as text (it contains non-numeric entries), so
# convert it to numeric first; unparseable values become NaN and are then
# dropped by the OLS fit.
auto["horsepower"] = pd.to_numeric(auto["horsepower"], errors="coerce")
full_formula = "mpg ~ year + acceleration + horsepower + weight"
reg_full = smf.ols(full_formula, data=auto).fit()
print(reg_full.summary())
OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.809 Model: OLS Adj. R-squared: 0.807 Method: Least Squares F-statistic: 408.8 Date: Thu, 01 Aug 2024 Prob (F-statistic): 1.72e-137 Time: 18:23:43 Log-Likelihood: -1037.1 No. Observations: 392 AIC: 2084. Df Residuals: 387 BIC: 2104. Df Model: 4 Covariance Type: nonrobust ================================================================================ coef std err t P>|t| [0.025 0.975] -------------------------------------------------------------------------------- Intercept -15.3889 4.671 -3.294 0.001 -24.574 -6.204 year 0.7511 0.052 14.381 0.000 0.648 0.854 acceleration 0.0802 0.100 0.803 0.422 -0.116 0.277 horsepower 0.0026 0.013 0.196 0.845 -0.024 0.029 weight -0.0066 0.000 -14.099 0.000 -0.008 -0.006 ============================================================================== Omnibus: 38.374 Durbin-Watson: 1.223 Prob(Omnibus): 0.000 Jarque-Bera (JB): 60.816 Skew: 0.639 Prob(JB): 6.22e-14 Kurtosis: 4.446 Cond. No. 8.35e+04 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 8.35e+04. This might indicate that there are strong multicollinearity or other numerical problems.
In [60]:
# Reduced model: drop 'year' and 'acceleration', keeping only horsepower and weight.
reduced_formula = "mpg ~ horsepower + weight"
reg_reduced = smf.ols(reduced_formula, data=auto).fit()
print(reg_reduced.summary())
OLS Regression Results ============================================================================== Dep. Variable: mpg R-squared: 0.706 Model: OLS Adj. R-squared: 0.705 Method: Least Squares F-statistic: 467.9 Date: Thu, 01 Aug 2024 Prob (F-statistic): 3.06e-104 Time: 18:24:16 Log-Likelihood: -1121.0 No. Observations: 392 AIC: 2248. Df Residuals: 389 BIC: 2260. Df Model: 2 Covariance Type: nonrobust ============================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------ Intercept 45.6402 0.793 57.540 0.000 44.081 47.200 horsepower -0.0473 0.011 -4.267 0.000 -0.069 -0.026 weight -0.0058 0.001 -11.535 0.000 -0.007 -0.005 ============================================================================== Omnibus: 35.336 Durbin-Watson: 0.858 Prob(Omnibus): 0.000 Jarque-Bera (JB): 45.973 Skew: 0.683 Prob(JB): 1.04e-10 Kurtosis: 3.974 Cond. No. 1.15e+04 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The condition number is large, 1.15e+04. This might indicate that there are strong multicollinearity or other numerical problems.
In [70]:
# ANOVA.
# Partial F-test: does adding 'year' and 'acceleration' to the reduced model
# significantly improve the fit? anova_lm compares the two nested models.
partial_f_table = sm.stats.anova_lm(reg_reduced, reg_full)
print(partial_f_table)
df_resid ssr df_diff ss_diff F Pr(>F) 0 389.0 6993.845437 0.0 NaN NaN NaN 1 387.0 4558.049951 2.0 2435.795486 103.405279 1.050026e-36
In [ ]:
# The p-value comparing these two models is highly significant, so the two variables make a significant
# contribution to the prediction of mpg, in addition to weight and horsepower.
In [74]:
# 2. LACK-OF-FIT TEST
# Test linearity by comparing the linear model (reduced) against a model with
# one dummy variable per distinct value of X (the full model, which makes no
# linearity assumption).
# Reduced model: mpg as a linear function of cylinders.
reg_reduced = smf.ols("mpg ~ cylinders", data=auto).fit()
# Full model: treat cylinders as an unordered categorical factor.
auto["cyl_categorical"] = auto["cylinders"].astype("category")
reg_full = smf.ols("mpg ~ C(cyl_categorical)", data=auto).fit()
# ANOVA comparing the two nested models.
anova_results = sm.stats.anova_lm(reg_reduced, reg_full)
print(anova_results)
df_resid ssr df_diff ss_diff F Pr(>F) 0 395.0 9638.365011 0.0 NaN NaN NaN 1 392.0 8758.095487 3.0 880.269523 13.133207 3.450919e-08
In [76]:
# Low p-value shows that the relation of mpg to the number of cylinders is non-linear.
# Continuous case: when the X-variable has (almost) no repeated values, bin it
# first. Round horsepower to the nearest 10 and treat each bin as a category.
auto["hp_rounded"] = auto["horsepower"].round(-1).astype("category")
# Reduced model: plain linear regression of mpg on horsepower.
reg_reduced = smf.ols("mpg ~ horsepower", data=auto).fit()
# Full model: one dummy per rounded-horsepower bin (no linearity assumed).
reg_full = smf.ols("mpg ~ C(hp_rounded)", data=auto).fit()
# ANOVA comparing the nested models.
anova_results = sm.stats.anova_lm(reg_reduced, reg_full)
print(anova_results)
df_resid ssr df_diff ss_diff F Pr(>F) 0 390.0 9385.915872 0.0 NaN NaN NaN 1 373.0 7101.888522 17.0 2284.02735 7.056468 6.662265e-15
In [ ]:
# The full model is significantly better. So, mpg is a non-linear function of horsepower.